import string
from random import random

from pyspark.sql import *
from pyspark.sql.types import *
import pyspark.sql.functions as f

if __name__ == '__main__':
    # Build a local SparkSession; all work runs in-process on every core.
    spark = SparkSession.builder.appName("test1_dataFrame_create")\
        .master("local[*]").getOrCreate()
    sc = spark.sparkContext

    # Each inner list becomes one row of the DataFrame (one "num" column).
    rdd = sc.parallelize([[1], [2], [3], [4], [5]], 3)
    df = spark.createDataFrame(rdd, ["num"])

    # Workaround: emulate a UDAF with the RDD mapPartitions operator.
    # For mapPartitions to see every row in one function call, the data
    # must live in a single partition.
    # coalesce(1) reaches one partition by merging existing ones, avoiding
    # the full shuffle that repartition(1) would trigger.
    single_partion_rdd = df.rdd.coalesce(1)
    def process(rows):
        """Sum the "num" field of every row in one partition.

        mapPartitions hands this function an iterator over the partition's
        rows (not a list) and expects an iterable back, so the single total
        is wrapped in a one-element list.
        """
        # `rows`/`total` instead of the original `iter`/`sum`, which
        # shadowed the builtins of the same names.
        total = 0
        for row in rows:
            total += row["num"]
        return [total]

    # Run the per-partition aggregation and pull the result to the driver;
    # with one partition this is a single-element list holding the total.
    result = single_partion_rdd.mapPartitions(process).collect()
    print(result)









